1 Background

Aim: study the distribution of the annotated junction categories for specific ataxia related genes.

Based on the analysis for STAR annotated introns, this report is focused on five causative Mendelian genes for Ataxia: ATXN1, ATXN2, ATXN7, CACNA1A and FXN. The results will be split by Ataxia diagnosis: Control, SCA1, SCA2, SCA6 and FRDA.

# Variables
ataxia_genes <- c("FXN", "ATXN1", "ATXN2", "ATXN7", "CACNA1A")

# Paths
metadata_path <- file.path(main_path, "metadata/metadata.csv")
multiqc_path <- file.path(main_path, "metadata/multiqc_rseqc_read_distribution.txt")

path_to_bam_files <- file.path("/home/grocamora/RytenLab-Research/11-Ataxia_bulk/Zhongbo_Script/all_filtered_BAM/")
path_to_junc_files <- file.path("/home/grocamora/RytenLab-Research/11-Ataxia_bulk/Zhongbo_Script/all_filtered_JUNC/")

# Require software paths
samtools_path <- file.path("/home/grocamora/tools/samtools/bin/")
regtools_path <- file.path("/home/grocamora/tools/regtools/build/")

# Path to the gtf file to annotate the introns and the ENCODE blacklisted region
gtf_ensembl_path <- file.path("/home/grocamora/RytenLab-Research/Additional_files/Homo_sapiens.GRCh38.105.gtf")
gtf_gencode_path <- file.path("/home/grocamora/RytenLab-Research/Additional_files/GENCODE/gencode.v38.annotation.gtf")
blacklist_path = file.path("/home/grocamora/RytenLab-Research/Additional_files/hg38-blacklist.v2.bed")

## Load the metadata
metadata <- readr::read_delim(metadata_path, show_col_types = FALSE) %>% 
  dplyr::rowwise() %>%
  dplyr::mutate(Individual_ID = stringr::str_split(ID_anon, "_", simplify = T)[1]) %>%
  dplyr::filter(!(Diagnosis %in% c("CANVAS", "AIFM1")))

Changelog

v1.3

  • Changed the reference transcriptome to Gencode v38.

v1.2

  • Changed the reference transcriptome to Gencode v39.

2 Pipeline

2.1 Loading the junctions in samples

The first step of the pipeline is to extract the junctions from the studied BAM files. The process is split in two steps:

  1. Download and extraction of BAM files: the files are downloaded from s3://ataxia-bulk-rnaseq/nextflow_first_attemp/Star_2_pass_by_indv/output/STAR/align/BAM_files/ and subfolders. The BAM files are then filtered to extract the information from the relevant genes studied in this report.

  2. Junction extraction: all junctions are extracted using regtools junction extract after sorting and indexing with samtools. A file is created for each BAM file in BED12 format.

dir.create(path_to_junc_files, showWarnings = F)
#doParallel::registerDoParallel(8) # Whether to use multiprocessing

junc_files_df <- foreach(i = seq_len(nrow(metadata)), .packages = c("tidyverse")) %dopar%{
  row = metadata[i, ]
  sample_name = row$Correct_sample
  sample_diagnosis = row$Diagnosis
  
  sample_files_df <- foreach(j = seq_along(ataxia_genes)) %do%{
    gene_name <- ataxia_genes[j]
    
    bam_path <- paste0(path_to_bam_files, sample_name, "_", sample_diagnosis, "_", gene_name, ".bam")
    sort_path <- paste0(bam_path, ".sort")
    junc_path <- paste0(path_to_junc_files, sample_name, "_", sample_diagnosis, "_", gene_name, ".bam.sort.s0.junc")
    
    # Execute the extraction pipeline if the junc file does not exists
    if(!file.exists(junc_path)){
      # Sort the BAM file (samtools)
      system2(command = paste0(samtools_path, "samtools"), args = c(
        "sort", bam_path,
        "-o", sort_path
      ))
  
      # Index the BAM file (samtools)
      system2(command = paste0(samtools_path, "samtools"), args = c(
        "index", sort_path
      ))
  
      # Extract the juntions (regtools)
      system2(command = paste0(regtools_path, "regtools"), args = c(
        "junctions extract", sort_path,
        "-m 25",
        "-M 1000000",
        "-s 0",
        "-o", junc_path
      ))
  
      # Remove side products
      system2(command = "rm", args = c(sort_path))
      system2(command = "rm", args = c(paste0(sort_path, ".bai")))
    }
    
    # Return information about the junc file created
    return(tibble::tibble(sample_name = sample_name,
                          gene_name = gene_name,
                          junc_file_exists = file.exists(junc_path),
                          junc_file_path = junc_path))
  } %>% dplyr::bind_rows()
} %>% dplyr::bind_rows()
  1. Once the junctions are extracted, load them into a single table where the reads per sample are registered.
#doParallel::registerDoParallel(8) # Whether to use multiprocessing
all_junc <- foreach(i = seq_len(nrow(metadata)), .packages = c("tidyverse")) %dopar%{
  row = metadata[i, ]
  sample_id <- row$ID_anon
  sample_name = row$Correct_sample
  sample_diagnosis = row$Diagnosis
  
  sample_junctions <- foreach(j = seq_along(ataxia_genes)) %do%{
    gene_name <- ataxia_genes[j]
    junc_path <- paste0(path_to_junc_files, sample_name, "_", sample_diagnosis, "_", gene_name, ".bam.sort.s0.junc")
    
    # If file not found, return empty tibble
    if(!file.exists(junc_path)) return(tibble())
    
    # Read the junction file
    junc <- readr::read_table(junc_path, col_names = F, col_types = "cddcdcddcdcc", 
                              progress = F, locale = readr::locale(grouping_mark = ""))
    
    # Transformations of the junctions
    junc <- junc %>%
      dplyr::select(chr = X1, start = X2, stop = X3, junID = X4, reads = X5, strand = X6, blockSizes = X11 ) %>%
      dplyr::mutate(strand = ifelse(strand == "?", "*", strand)) %>%
      dplyr::mutate(sample_id = sample_id,
                    gene_name = gene_name) %>%
      tidyr::separate(col = blockSizes, sep = ",", c("blockSizesStart", "blockSizesEnd"), conver = T) %>%
      dplyr::mutate(start = start + blockSizesStart + 1, stop = stop - blockSizesEnd) %>%
      GenomicRanges::GRanges() %>%
      diffloop::rmchr() %>%
      tibble::as_tibble() %>%
      dplyr::mutate(seqnames = as.character(seqnames),
                    strand = as.character(strand)) %>%
      dplyr::select(-junID, -blockSizesStart, -blockSizesEnd)
    
    return(junc)
  } %>% dplyr::bind_rows()
} %>% dplyr::bind_rows()

# Unique ID for each junction and generate the reads table
all_reads_combined <- all_junc %>%
  dplyr::group_by(seqnames, start, end) %>%
  dplyr::mutate(junID = dplyr::cur_group_id(), .before = seqnames) %>%
  dplyr::ungroup() %>%
  tidyr::pivot_wider(values_from = reads, names_from = c(sample_id, gene_name)) %>%
  replace(is.na(.), 0)

2.2 Junction annotation

Junctions are first filtered: (i) removing all junctions that overlap with the ENCODE blacklisted region and (ii) removing all junctions that are less than 25 bp long. Then, junctions are annotated using dasper function junction_annot and a reference transcriptome (GENCODE v38 in this report).

Additionally, all junctions associated to more than one gene are removed as they are considered ambiguous.

# The annotation process takes some time, so execute only if the output is not found on disk.
gene_specific_annotated_path <- file.path(main_path, "variables/gene_specific_annotated.rds")

if(!file.exists(gene_specific_annotated_path)){
  # Convert the junctions to GR object
  all_junc_gr <- all_reads_combined %>%
    dplyr::select(junID, seqnames, start, end, width, strand) %>%
    GenomicRanges::GRanges()
  
  # Filter by ENCODE blacklisted region
  encode_blacklist_hg38 <- loadEncodeBlacklist(blacklist_path)
  all_junc_gr <- removeEncodeBlacklistRegions(all_junc_gr, encode_blacklist_hg38)
  
  # Filter junctions < 25bp long
  all_junc_gr <- all_junc_gr[which(width(all_junc_gr) >= 25), ]
  
  # Load the reference transcriptome. Some reference transcriptomes (i.e. Gencode)
  # use the characters "chr" before the seqnames. To bring consistency, the
  # "seqlevelsStyle" is set to "NCBI".
  TxDb_ref <- dasper:::ref_load(gtf_gencode_path) %>% `seqlevelsStyle<-`("NCBI")
  
  # Annotate the junctions
  annot_junc_gr <- all_junc_gr %>% dasper::junction_annot(ref = TxDb_ref)
  
  # Filter ambiguous genes
  annot_junc_gr <- annot_junc_gr[which(sapply(annot_junc_gr$gene_id_junction, length) < 2), ]
  
  # Remove unnecesary columns
  annot_junc <- annot_junc_gr %>% 
    tibble::as_tibble() %>%
    dplyr::select(junID, seqnames, start, end, width, strand, type)
  
  annot_junc %>% saveRDS(gene_specific_annotated_path)
}else{
  annot_junc <- readRDS(gene_specific_annotated_path)
}
## [1] "2023-06-05 08:10:58 - Importing gtf/gff3 as a TxDb..."
## [1] "2023-06-05 08:12:27 - Obtaining co-ordinates of annotated exons and junctions..."
## [1] "2023-06-05 08:12:41 - Getting junction annotation using overlapping exons..."
## [1] "2023-06-05 08:12:42 - Tidying junction annotation..."
## [1] "2023-06-05 08:12:42 - Deriving junction categories..."
## [1] "2023-06-05 08:12:42 - done!"

Lastly, we combine the reads and the annotation category into a single table. An example of the final table:

annot_junc_df <- annot_junc %>% 
  dplyr::left_join(all_reads_combined %>% dplyr::select(-c(seqnames, start, end, width, strand)), 
                   by = "junID") %>%
  dplyr::select(-c(seqnames, start, end, width, strand)) %>%
  tidyr::pivot_longer(cols = -c(junID, type), names_to = "sample_id", values_to = "count") %>%
  dplyr::filter(count > 0) %>%
  dplyr::mutate(gene_name = gsub("^.*_", "", sample_id),
                sample_id = gsub("_[a-zA-Z1-9]*$", "", sample_id),
                .before = count) 

annot_junc_df %>%
  dplyr::sample_n(10) %>%
  kableExtra::kbl(booktabs = T, linesep = "") %>%
  kableExtra::kable_classic(full_width = T, "hover", "striped", html_font = "Cambria", font_size = 14) %>%
  kableExtra::row_spec(0, bold = T, font_size = 16)
junID type sample_id gene_name count
570 annotated CO17B_Cerebellum CACNA1A 54
1085 novel_exon_skip C23B_Frontal CACNA1A 6
1858 unannotated CO20B_Frontal FXN 2
486 novel_acceptor CO3A_Cerebellum CACNA1A 3
1797 annotated C22B_Frontal ATXN1 9
202 annotated CO9A_Cerebellum ATXN2 78
236 novel_exon_skip CO14B_Frontal ATXN2 2
863 novel_acceptor C24B_Cerebellum CACNA1A 3
1281 unannotated C19B_Frontal CACNA1A 4
1084 novel_donor C25B_Frontal CACNA1A 1

3 Overview of the junctions

To better understand the results presented in the report, it is important to consider the number of junctions associated to each gene. The following graph represents the number of junctions associated to each of the studies genes. Each point represent a sample from where the junctions were extracted.

Number of junctions associated to each of the studied genes. The low number of junctions for gene FXN explain the inconsistent results found in later sections.

Figure 3.1: Number of junctions associated to each of the studied genes. The low number of junctions for gene FXN explain the inconsistent results found in later sections.

4 Visualization of all junction categories

The first plots shown here represent the distributions using all categories from the dasper annotation function.

Across all visualizations, a Wilcoxon signed-rank test between the distributions was executed, and significant results (\(p<0.05\)) are shown in the graphs. The p-values are Bonferroni corrected given the number of tests executed per group.

4.1 Proportion of junctions by category

First, we plot the category percentages by sample. We employ the function plotJunctionCategories. Source.

Level 3 (Diagnosis)

All genes

Proportion of junctions by Diagnosis and tissue. Results presented for all junctions found within the studied genes.

Figure 4.1: Proportion of junctions by Diagnosis and tissue. Results presented for all junctions found within the studied genes.

ATXN1

Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN1 gene.

Figure 4.2: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN1 gene.

ATXN2

Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN2 gene.

Figure 4.3: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN2 gene.

ATXN7

Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN7 gene.

Figure 4.4: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN7 gene.

CACNA1A

Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the CACNA1A gene.

Figure 4.5: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the CACNA1A gene.

FXN

Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the FXN gene.

Figure 4.6: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the FXN gene.

4.2 Proportion of counts by category

Next, we also represent the proportion of counts (or reads) that correspond to each category when compared against the total number of junction reads. We use the function plotJunctionCountCategories.

Level 3 (Diagnosis)

All genes

Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Cerebellum brain region.

Figure 4.7: Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Frontal Cortex brain region.

Figure 4.8: Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Frontal Cortex brain region.

ATXN1

Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Cerebellum brain region.

Figure 4.9: Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Frontal Cortex brain region.

Figure 4.10: Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Frontal Cortex brain region.

ATXN2

Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Cerebellum brain region.

Figure 4.11: Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Frontal Cortex brain region.

Figure 4.12: Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Frontal Cortex brain region.

ATXN7

Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Cerebellum brain region.

Figure 4.13: Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Frontal Cortex brain region.

Figure 4.14: Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Frontal Cortex brain region.

CACNA1A

Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Cerebellum brain region.

Figure 4.15: Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Frontal Cortex brain region.

Figure 4.16: Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Frontal Cortex brain region.

FXN

Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Cerebellum brain region.

Figure 4.17: Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Frontal Cortex brain region.

Figure 4.18: Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Frontal Cortex brain region.

5 Visualization for three categories

Lastly, the same visualizations are presented but using only three categories: Annotated, Partially annotated and Unannotated.

5.1 Proportion of junctions by category

Level 3 (Diagnosis)

All genes

Proportion of junctions by Diagnosis and tissue. Results presented for all junctions found within the studied genes.

Figure 5.1: Proportion of junctions by Diagnosis and tissue. Results presented for all junctions found within the studied genes.

ATXN1

Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN1 gene.

Figure 5.2: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN1 gene.

ATXN2

Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN2 gene.

Figure 5.3: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN2 gene.

ATXN7

Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN7 gene.

Figure 5.4: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN7 gene.

CACNA1A

Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the CACNA1A gene.

Figure 5.5: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the CACNA1A gene.

FXN

Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the FXN gene.

Figure 5.6: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the FXN gene.

5.2 Proportion of counts by category

Level 3 (Diagnosis)

All genes

Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Cerebellum brain region.

Figure 5.7: Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Frontal Cortex brain region.

Figure 5.8: Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Frontal Cortex brain region.

ATXN1

Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Cerebellum brain region.

Figure 5.9: Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Frontal Cortex brain region.

Figure 5.10: Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Frontal Cortex brain region.

ATXN2

Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Cerebellum brain region.

Figure 5.11: Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Frontal Cortex brain region.

Figure 5.12: Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Frontal Cortex brain region.

ATXN7

Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Cerebellum brain region.

Figure 5.13: Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Frontal Cortex brain region.

Figure 5.14: Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Frontal Cortex brain region.

CACNA1A

Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Cerebellum brain region.

Figure 5.15: Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Frontal Cortex brain region.

Figure 5.16: Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Frontal Cortex brain region.

FXN

Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Cerebellum brain region.

Figure 5.17: Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Cerebellum brain region.

Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Frontal Cortex brain region.

Figure 5.18: Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Frontal Cortex brain region.